library(readxl)
library(readr)
library(seededlda)
library(quanteda)
library(tidyverse)
library(ggplot2)
library(scales)

# ----------------------------------------------------------
# Load data
# NOTE: Adjust the file paths below to match where you downloaded the data files.
# ----------------------------------------------------------
# The news workbook is read without a header row, so readxl names its columns
# `...1`, `...2`, ...; the two candidate files are read with their own headers.
news_raw <- read_excel("PATH/TO/news_crawling.xlsx", col_names = FALSE)
lee_data <- read_csv("PATH/TO/Text_data_Lee.csv")
yoon_data <- read_csv("PATH/TO/Text_data_Yoon.csv")


# News DFM
# The text comes from the fifth column of the news sheet. The sheet was loaded
# as `news_raw`; the previous code read from an undefined `news_data` object.
stopifnot("...5" %in% names(news_raw))
text_data <- news_raw[["...5"]]
text_data <- str_squish(text_data[!is.na(text_data)])
# Commas become single spaces so that comma-separated items tokenise as
# separate words (same result as splitting on "," and re-joining with " ").
news_vec <- str_replace_all(text_data, fixed(","), " ")
dfm_news <- dfm(tokens(corpus(news_vec), remove_punct = TRUE, remove_symbols = TRUE))

# Candidate DFMs (same preprocessing as before)
# prep_vec(): turn a character vector of documents into a quanteda dfm.
#   x         - character vector, one element per document
#   stopwords - patterns handed to tokens_remove()
# Steps: tokenise without punctuation/symbols, drop the stopwords, then keep
# only tokens made up entirely of two or more Hangul syllables.
prep_vec <- function(x, stopwords) {
  raw_tokens <- tokens(corpus(x), remove_punct = TRUE, remove_symbols = TRUE)
  without_stopwords <- tokens_remove(raw_tokens, pattern = stopwords)
  hangul_tokens <- tokens_select(
    without_stopwords,
    pattern = "^[가-힣]{2,}$",
    valuetype = "regex",
    padding = FALSE
  )
  dfm(hangul_tokens)
}
# Custom Korean stopword list passed to prep_vec() for both candidates'
# remarks. The vector is kept exactly as written; a few entries occur more
# than once (e.g. "이런", "그때", "같아요"), which is redundant for removal.
my_stopwords <- c(
  "이런","우리","저는","제가","지금","그런","하는","있는","때문에","그리고","그것","이제","합니다","그래서","즉","하지만","여러분",
  "말씀을","정말","이렇게","아까","하고","대해서","않습니까","것은","그런데","말씀","이미","그렇게","있습니다","많이","사실","너무",
  "하겠습니다","매우","번째","있는데","해서","한번","생각합니다","됩니다","우리가","겁니다","다른","저희가","말씀하신","것이","이게",
  "것처럼","하면","한다","드립니다","없는","정말로","저도","아니고","생각이","내가","통해서","먼저","가지고","된다","가장","위한",
  "어떻게","같습니다","되는","또는","듭니다","좋은","예를","그걸","가지","알겠습니다","드리고","해야","없습니다","계속","일단","제대로",
  "있다","그건","건지","여쭤보겠습니다","아니","사실은","맞습니다","전적으로","많은","드리고요","하지","같아요","결국","답을","식으로",
  "아마","구체적으로","이번","선택을","가지는","싶습니다","그러나","한다는","그때","저한테","그게","자체를","그때","같은","당연히",
  "있게","가능한","지금도","이해가","싶은","그러니까","특히","거죠","지금은","아닌","아니라","된다는","때는","있다고","만들고","그렇죠",
  "아주","거의","이유는","혹시","다시","바로","하게","드리는","중에","피해를","되는데","똑같은","확실하게","말씀드리겠습니다",
  "그거는","같아요","있으면","같은","그게","정도","이런","이제","다시","조금","이거","저희","일단","왜냐하면","그런데","그래서",
  "그러니까","이렇습니다","그렇습니다","있습니다","없습니다","합니다","합시다","했는데","됐습니다","있어요","없어요","어떤",
  "하셨는데","되면","후보께서","것을","얘기를","글쎄","보면","여러","그러면","거기에","아니겠습니까","대한","이거를","알고","있고",
  "것입니다","결국은","된다고","것이고","한다고","대해서는","보니까","이거는","거는","것도","나온","되고","하나","번째로","않고",
  "하는데","것이고","말씀드립니다","전에","있다는","얼마든지","말씀은","후보님","아닙니다","없다","원래","들면","위해서","하기","드린","나중에","이야기를"
)
# Lower-case each candidate's `remarks` column, then build one dfm per
# candidate with the shared preprocessing and stopword list.
lee_vec  <- lee_data$remarks |> tolower()
yoon_vec <- yoon_data$remarks |> tolower()
lee_dfm  <- lee_vec |> prep_vec(my_stopwords)
yoon_dfm <- yoon_vec |> prep_vec(my_stopwords)

# Keywords (unchanged order/content)
# Named list of seed words, one element per topic. It is converted with
# dictionary() for every seeded-LDA fit below, and names(keywords) is reused
# as the topic order / fallback topic names in the figures.
# Note: "기본소득" is listed under both `economy` and `welfare`.
keywords <- list(
  economy = c("경제","기본소득","소득","국채","세금","증세","이자율","가계부채","구조조정","시장","재정","통화"),
  budget  = c("예산","재원","발행","지원금","재난지원금","손실보상법","불균형","현금"),
  gender  = c("여성","여자","페미니즘","성평등","성차별","젠더","하사","중사","다양성","성범죄"),
  security_foreign = c("안보","평화","전쟁","미사일","우크라이나","전술핵","북한","동맹","침공","나토","젤렌스키","러시아","외교","국방","한미일","방위산업","군사적"),
  welfare = c("청년","저출산","아이","일자리","지원","복지","교육","안전망","기본소득","장애인"),
  daejangdong = c("대장동","김만배","녹취록","수사","화천대유","도시개발공사","공소장","게이트"),
  electoral_reform = c("위성정당","연동형","선거법","개헌","선거제도","개혁","합당"),
  party_politics   = c("정당","민주당","국민의힘","정의당","국민의당"),
  policy_general   = c("정책","정책에","정책으로","정책을","정책이","정책의","공약","공약을","공약으로")
)

# Models (same seeds/hyperparameters)
# Three seeded-LDA fits sharing the keyword dictionary and priors: news only,
# both candidates stacked (Lee rows first, then Yoon), and all documents.
# The RNG is re-seeded immediately before each fit.
# NOTE(review): `k` may not be a formal argument of textmodel_seededlda() in
# current seededlda releases (it would be absorbed by `...`) - confirm against
# the installed version before relying on it.
set.seed(123)
seeded_news <- textmodel_seededlda(
  x = dfm_news,
  dictionary = dictionary(keywords),
  k = length(keywords),
  alpha = 0.1,
  beta = 0.01
)

set.seed(123)
seeded_cand <- textmodel_seededlda(
  x = rbind(lee_dfm, yoon_dfm),
  dictionary = dictionary(keywords),
  k = length(keywords),
  alpha = 0.1,
  beta = 0.01
)

set.seed(123)
seeded_all <- textmodel_seededlda(
  x = rbind(dfm_news, lee_dfm, yoon_dfm),
  dictionary = dictionary(keywords),
  k = length(keywords),
  alpha = 0.1,
  beta = 0.01
)


# ---- (Korean font) ----
# Attach showtext/sysfonts quietly, register the Google font "Noto Sans KR"
# under the family name "noto" unless a family of that name already exists,
# then switch on showtext rendering for subsequent graphics.
suppressPackageStartupMessages({
  library(showtext)
  library(sysfonts)
})
if (is.na(match("noto", sysfonts::font_families()))) {
  sysfonts::font_add_google(name = "Noto Sans KR", family = "noto")
}
showtext::showtext_auto(enable = TRUE)

# Appendix–Figure A2
# seeded_top_words(): the `m` highest-probability terms for every topic.
#   mod - fitted model exposing `phi` (topic-term probabilities) and `theta`
#         (document-topic proportions)
#   m   - number of terms to return per topic
# Returns a character matrix with one column per topic and (up to) `m` rows,
# terms ordered from most to least probable.
seeded_top_words <- function(mod, m = 6) {
  # Preferred path: seededlda's own extractor. `terms()` is provided by
  # seededlda (quanteda does not export one), so it is called from there.
  top <- tryCatch(seededlda::terms(mod, m), error = function(e) NULL)
  if (!is.null(top)) {
    return(top)
  }

  # Manual fallback: rank the columns of `phi` within each topic.
  phi <- mod$phi
  n_topics <- ncol(mod$theta)
  # Normalise to topics-in-rows. A square `phi` is taken to be topics x terms
  # already; a terms x topics matrix is transposed.
  if (nrow(phi) != n_topics) {
    if (ncol(phi) != n_topics) {
      stop("`phi` has no dimension matching the number of topics in `theta`.",
           call. = FALSE)
    }
    phi <- t(phi)
  }
  vocab <- colnames(phi)
  if (is.null(vocab)) {
    stop("`phi` carries no term names.", call. = FALSE)
  }

  n_top <- min(m, ncol(phi))
  top <- vapply(
    seq_len(nrow(phi)),
    function(j) vocab[order(phi[j, ], decreasing = TRUE)[seq_len(n_top)]],
    character(n_top)
  )
  # vapply() collapses to a plain vector when n_top == 1; always hand back a
  # matrix so callers can index columns.
  matrix(top, nrow = n_top, dimnames = list(NULL, rownames(phi)))
}

# plot_seeded_topic_props(): horizontal bar chart of mean topic proportions,
# each bar annotated with the topic's top terms.
#   mod         - fitted model with `theta` (documents x topics)
#   m_top       - number of top terms shown per topic
#   title       - plot title
#   topic_order - optional character vector giving the top-to-bottom order of
#                 topics; defaults to alphabetical order of the topic names
# Returns a ggplot object.
plot_seeded_topic_props <- function(mod, m_top = 6, title = "Topic Proportions", topic_order = NULL) {
  theta <- mod$theta
  topic_names <- colnames(theta)
  if (is.null(topic_names)) {
    topic_names <- paste0("Topic_", seq_len(ncol(theta)))
  }
  colnames(theta) <- topic_names
  n_topics <- length(topic_names)

  # Bring the top-term matrix into one-column-per-topic shape if it is not.
  top_terms <- seeded_top_words(mod, m_top)
  if (ncol(top_terms) != n_topics) {
    named_by_topic <- !is.null(colnames(top_terms)) &&
      all(colnames(top_terms) %in% topic_names)
    if (named_by_topic) {
      top_terms <- top_terms[, topic_names, drop = FALSE]
    } else if (nrow(top_terms) == n_topics) {
      top_terms <- t(top_terms)
    } else {
      top_terms <- top_terms[, seq_len(n_topics), drop = FALSE]
    }
  }
  term_labels <- vapply(
    seq_len(n_topics),
    \(j) paste(top_terms[, j], collapse = ", "),
    character(1)
  )

  # Factor levels are reversed so the first topic is drawn at the top.
  level_order <- if (is.null(topic_order)) {
    rev(sort(unique(topic_names)))
  } else {
    rev(topic_order)
  }
  plot_df <- tibble(
    Topic = factor(topic_names, levels = level_order),
    theta = unname(colMeans(theta)),
    label = term_labels
  )

  # Labels sit a little to the right of each bar; the axis is widened to fit.
  gap <- 0.06 * max(plot_df$theta)
  label_x <- plot_df$theta + gap * 3
  axis_max <- max(label_x) + gap * 2

  ggplot(plot_df, aes(y = Topic, x = theta, fill = Topic)) +
    geom_col(width = 0.75, show.legend = FALSE) +
    geom_text(aes(x = label_x, label = label), hjust = 0, size = 3.4) +
    scale_x_continuous(
      labels = percent_format(accuracy = 1),
      limits = c(0, axis_max),
      expand = expansion(mult = c(0, 0.02))
    ) +
    labs(title = title, x = "Expected topic proportions", y = "Topic") +
    theme_minimal(base_size = 12) +
    theme(
      plot.title = element_text(face = "bold", hjust = 0.5),
      axis.text.y = element_text(size = 11),
      plot.margin = margin(t = 10, r = 140, b = 10, l = 10)
    ) +
    scale_fill_grey(start = 0.85, end = 0.25)
}

# A2: use the all-documents model, keep keyword order
# Topics are listed in the order the keyword list defines them.
p_A2 <- plot_seeded_topic_props(
  mod = seeded_all,
  m_top = 6,
  title = "Appendix–Figure A2: Topic Proportions (All Documents)",
  topic_order = names(keywords)
)
print(p_A2)


# Appendix–Figure A3
# Split candidate rows from seeded_cand$theta using original DFMs
# The candidate model was fitted on rbind(lee_dfm, yoon_dfm), so Lee's
# documents are the first n_lee rows and Yoon's the following n_yoon rows.
n_lee  <- ndoc(lee_dfm)
n_yoon <- ndoc(yoon_dfm)

# Fail loudly if the fitted theta does not line up with the input DFMs,
# rather than silently truncating or indexing past the end.
stopifnot(nrow(seeded_cand$theta) == n_lee + n_yoon)

theta_news <- seeded_news$theta
# seq_len() keeps the row selection correct when a count is 0
# (`1:0` would select row 1).
theta_cand <- rbind(
  seeded_cand$theta[seq_len(n_lee), , drop = FALSE],
  seeded_cand$theta[n_lee + seq_len(n_yoon), , drop = FALSE]
)

# Fall back to the keyword names when theta carries no topic names.
colnames(theta_news) <- colnames(theta_news) %||% names(keywords)
colnames(theta_cand) <- colnames(theta_cand) %||% names(keywords)

# bayes_boot_diff(): Bayesian-bootstrap summary of a difference in means.
#   x_treat, x_ctrl - numeric vectors for the two groups
#   B               - number of bootstrap draws
# Each draw reweights both groups with normalised Gamma(1) weights and takes
# the weighted mean of x_treat minus the weighted mean of x_ctrl.
# Returns a one-row tibble: post_mean, post_median, l95, u95 (2.5% / 97.5%
# quantiles of the draws) and p_gt0 (share of draws above zero).
bayes_boot_diff <- function(x_treat, x_ctrl, B = 5000L) {
  n_treat <- length(x_treat)
  n_ctrl <- length(x_ctrl)

  # One draw; treatment weights are sampled before control weights.
  one_draw <- function() {
    w_treat <- rgamma(n_treat, 1)
    w_treat <- w_treat / sum(w_treat)
    w_ctrl <- rgamma(n_ctrl, 1)
    w_ctrl <- w_ctrl / sum(w_ctrl)
    sum(w_treat * x_treat) - sum(w_ctrl * x_ctrl)
  }
  draws <- replicate(B, one_draw())

  tibble(
    post_mean = mean(draws),
    post_median = median(draws),
    l95 = quantile(draws, 0.025),
    u95 = quantile(draws, 0.975),
    p_gt0 = mean(draws > 0)
  )
}

# Per-topic Bayesian-bootstrap difference (candidates minus news), sorted by
# absolute posterior mean. A topic is flagged "Credible (95%)" when both
# interval bounds share a sign (their product is positive).
set.seed(123)
res_A3 <- colnames(theta_cand) |>
  map_dfr(\(topic) {
    bayes_boot_diff(theta_cand[, topic], theta_news[, topic], B = 5000L) |>
      mutate(Topic = topic, .before = 1)
  }) |>
  arrange(desc(abs(post_mean))) |>
  mutate(
    sig95 = if_else(l95 * u95 > 0, "Credible (95%)", "Not credible")
  )

# Figure A3: point estimate and 95% interval per topic, topics ordered by
# posterior mean, with a dashed reference line at zero; axes are flipped so
# topics run down the vertical axis.
res_A3 |>
  ggplot(mapping = aes(x = reorder(Topic, post_mean), y = post_mean)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_point() +
  geom_errorbar(mapping = aes(ymin = l95, ymax = u95), width = 0.15) +
  coord_flip() +
  labs(
    title = "Appendix–Figure A3: Difference in Topic Proportions (Candidates − News)",
    x = "Topic",
    y = "Mean difference (95% CrI)"
  ) +
  theme_minimal()




